## Load libraries
library(DESeq2)
library(patchwork)
library(foreach)
library(doParallel)
# tidyverse is attached last so its functions mask same-named functions from
# the packages above rather than the other way round; do not reorder.
library(tidyverse)
## Define main path
# Hard-coded project root; the commented here::here() would make it portable.
main_path <- file.path("/home/grocamora/RytenLab-Research/18-DGE_limma_dream")# here::here()
## Load helper functions
# These presumably define the helpers used later in the report
# (custom_gg_theme, loadMetadataHardy, plotPCgroup) -- confirm which file
# provides which.
source(file.path(main_path, "R/hf_graph_and_themes.R"))
source(file.path(main_path, "R/hf_reports.R"))
source(file.path(main_path, "R/run_stat_test_for_all_cols_get_res.R"))
## Set options
# Silence dplyr::summarise grouping messages; surface lifecycle warnings.
options(dplyr.summarise.inform = FALSE)
options(lifecycle_verbosity = "warning")
# knitr chunk defaults for the rendered report.
# NOTE(review): prefer TRUE/FALSE over T/F (T and F are reassignable).
knitr::opts_chunk$set(echo = T, warning = F, message = F,
out.width="100%", fig.align = "center", dpi = 100,
fig.width = 7.2, fig.height = 7.2)Update of Jon’s PCA and PCA corrections report. The covariates employed in this report are extracted from the following script:
/home/grocamora/RytenLab-Research/18-DGE_limma_dream/Scripts/new_covariate_correction.R
The script updates the previous iteration by Jon with Aine’s latest updates (as of 07/09/2023). The selected covariates are:
GC_NC_40_59 + INTRONIC_BASES + PCT_UTR_BASES + Total_Sequences + gender + tss_up_1kb_tag_pct
As mentioned, this report relies on the previous execution of the 01-Hardy_covariate_identification.R script, since it reads the output generated by it. Thus, we need to specify the path to input files, load the metadata and blacklisted genes:
# Paths -----------------------------------------------------------------------
# Project root and the results folder written by
# 01-Hardy_covariate_identification.R
main_path <- "/home/grocamora/RytenLab-Research/18-DGE_limma_dream"
results_path <- file.path(main_path, "results", "Hardy_NewCovs")
# Inputs: salmon quantifications, covariate metadata and outlier files
txi.salmon_path <- file.path(main_path, "data", "Hardy", "txi.salmon.rds")
metadata_path <- file.path(results_path, "all_brain_areas_covariates.rds")
sample_metadata_path <- file.path(main_path, "metadata", "Hardy", "updated_metadata_w_PMI.csv")
sample_outliers_template_path <- file.path(results_path, "sample_outliers")
bxp_outliers_path <- file.path(results_path, "bxp_outliers.csv")
# Gene annotation inputs (blacklist_genes holds a file path at this point; it
# is overwritten with the loaded gene vector further down)
blacklist_genes <- file.path(main_path, "data", "blist_genes_no_vers.csv")
gene_map_path <- file.path(main_path, "data", "gencode_txid_to_geneid.txt")
# Load data --------------------------------------------------------------------
# Salmon transcript quantifications (tximport object) and the raw count matrix
txi.salmon <- readRDS(txi.salmon_path)
cts <- txi.salmon$counts
# Covariate + sample metadata (loadMetadataHardy is a project helper sourced
# at the top of the script; definition not shown here)
metadata <- loadMetadataHardy(metadata_path, sample_metadata_path)
# Covariates selected by new_covariate_correction.R (see report header)
covariates <- c("GC_NC_40_59", "gender",
"tss_up_1kb_tag_pct", "PCT_UTR_BASES",
"Total_Sequences", "INTRONIC_BASES")
# Load blacklisted genes
# NOTE: `blacklist_genes` held the CSV path up to this point; it is replaced
# here by the gene-ID vector itself. Use TRUE, not the reassignable alias T.
blacklist_genes <- vroom::vroom(blacklist_genes, col_names = TRUE, delim = ",")
blacklist_genes <- as.vector(unlist(blacklist_genes))
# Transcript-to-gene map (GENCODE); version suffixes are stripped from both
# transcript and gene IDs so they match the unversioned IDs used elsewhere
gencode_txid_to_geneid <- vroom::vroom(gene_map_path) %>%
`colnames<-`(c("tx_id", "gene_id","gene_name", "description")) %>%
dplyr::mutate(tx_id = sub("\\..+", "", tx_id),
gene_id = sub("\\..+", "", gene_id))
# QC checks
# Sanity check: count-matrix columns must match the metadata sample IDs
# one-to-one and in order (prints TRUE when aligned).
# NOTE(review): `==` recycles when lengths differ; identical() would be
# stricter -- confirm lengths always match upstream.
all(colnames(cts) == metadata$bxp_id_full)Additionally, we apply an expression threshold in which a \(fpm>1\) in 50% of the samples is required for a gene to be employed in the analysis. We also remove the genes from the blacklisted regions.
# Build the DESeq2 dataset; the design includes the selected covariates plus
# the Group (treatment) variable.
dds <- DESeq2::DESeqDataSetFromTximport(txi = txi.salmon,
colData = metadata,
design = reformulate(c(covariates, "Group")))
# Estimate library size correction scaling factors
dds <- DESeq2::estimateSizeFactors(dds)
# Remove blacklist genes
dds <- dds[!rownames(dds) %in% blacklist_genes, ]
# Identify genes that pass expression cutoff - then filter out genes that are
# extremely lowly expressed here, the standard cut-off set is fpm>1 in 50% or
# more of samples
isexpr <- rowSums(DESeq2::fpm(dds) > 1) >= 0.5 * ncol(dds)
# Compute log2 Fragments Per Million
# (+1 pseudocount avoids log2(0); only genes passing the cutoff are kept)
quantlog.counts = log2(DESeq2::fpm(dds)[isexpr, ] + 1)Added two different selection of outliers: 18 outliers vs. 7 outliers.
Added more graphs to assist with the decision of which outliers to remove.
Added WGCNA approach to the outlier removal methods.
Updated to use a total of 16 outliers.
First, we study if the covariates we selected group the samples in the PC plots. We select PCs until 85% of the variance is explained:
# PCA on the uncorrected log2-FPM matrix (transposed so samples are rows)
pca <- prcomp(t(quantlog.counts))
# First PC at which the cumulative proportion of variance exceeds 85%
# (row 3 of summary()$importance is the cumulative proportion)
highest_pc <- which(summary(pca)$importance[3, ] > 0.85)[1]
# Sample scores for the retained PCs, with sample IDs as a column
pca_out <- pca$x %>%
.[, seq(highest_pc)] %>%
tibble::as_tibble(rownames = "bxp_id_full")
# Per-PC variance, % of total variance and cumulative %
var_explained <- pca$sdev^2 %>%
tibble::tibble(PC = factor(seq_along(.)),
variance = .) %>%
dplyr::mutate(pct = variance/sum(variance)*100,
pct_cum = cumsum(pct))
# Scree plot: per-PC % variance (bars), cumulative % (line/points) and the
# 85% threshold. Twice as many PCs as needed are shown for context.
var_explained_plot <- var_explained %>%
dplyr::filter(PC %in% seq_len(highest_pc * 2)) %>%
ggplot(aes(x = PC)) +
geom_col(aes(y = pct), color = "black", fill = ggsci::pal_jco()(3)[3]) +
geom_line(aes(y = pct_cum, group = 1)) +
geom_point(aes(y = pct_cum)) +
geom_text(aes(label = round(pct, 2), y = pct + 3)) +
scale_y_continuous(expand = expansion(), limits = c(0, 100),
breaks = c(seq(0, 100, 25), 85)) +
# Hard-coded linetype: the previous aes(linetype = "PC_limit") mapping was
# redundant because the literal linetype overrode it and the legend was
# suppressed anyway.
geom_hline(yintercept = 85, linetype = "dashed") +
labs(x = "Principal component", y = "% Variance explained") +
custom_gg_theme + theme(legend.position = "right")
var_explained_plotAs shown in the figure, we only need 5 PCs to explain up to 85% of the variance.
# Kruskal-Wallis test of PC1 against Group: a significant p-value indicates
# the sample groups separate along PC1 before covariate correction.
# (the magrittr dot pipes the joined table into the `data` argument)
pca_out %>%
dplyr::left_join(metadata %>%
dplyr::select(bxp_id_full, Group),
by = "bxp_id_full") %>%
kruskal.test(PC1 ~ Group, data = .)##
## Kruskal-Wallis rank sum test
##
## data: PC1 by Group
## Kruskal-Wallis chi-squared = 9.8861, df = 4, p-value = 0.04239
# Emit one tabbed report section ("### <covariate> {-}") with a PC plot per
# covariate; "pd" presumably plots disease status -- confirm against
# plotPCgroup's definition.
# NOTE(review): sapply() is used purely for side effects (cat + plot); a for
# loop or purrr::walk() would be more idiomatic. `r` collects only NULLs.
r <- sapply(c(covariates, "pd"), function(covariate){
cat("### ", covariate, " {-}\n", sep = "")
plot(plotPCgroup(pca_out, metadata, covariate))
cat("\n\n")
})We employ limma::removeBatchEffect to covariate correct the salmon quantifications. In this example, our treatment matrix will consist of the Group variable.
# Build the covariate data frame for the design matrices: numeric covariates
# are z-scaled and sample IDs become rownames so that model.matrix rows align
# with the expression-matrix columns.
covs_df <- metadata %>%
dplyr::select(bxp_id_full, all_of(covariates), Group) %>%
dplyr::mutate(across(where(is.numeric), ~ scale(.))) %>%
tibble::column_to_rownames("bxp_id_full")
# Full design, split into the treatment part (intercept + Group dummies) and
# the batch part (all remaining columns, i.e. the covariates to correct out)
design <- model.matrix(reformulate(c(covariates, "Group")), data = covs_df)
design_treatment <- design[, grepl("(^Group)|(^\\(Intercept\\)$)", colnames(design))]
design_batch <- design[, !colnames(design) %in% colnames(design_treatment)]
# Further split of the batch columns by covariate type (kept for inspection;
# only design_batch is passed to removeBatchEffect below)
design_batch_numeric <- design[, covs_df %>% dplyr::select(where(is.numeric)) %>% colnames()]
design_batch_categorical <- design[, !colnames(design) %in% c(colnames(design_treatment),
colnames(design_batch_numeric))]
# Test that rownames in design df == colnames in feature matrix.
# all() yields a single TRUE/FALSE; the previous unique(a == b) could mask a
# length mismatch through silent recycling.
all(colnames(quantlog.counts) == rownames(design))
# Apply removeBatchEffect to correct out the effect of the covariates
# The fitted effects of design_batch are subtracted from the log2-FPM matrix
# while the Group effect (design_treatment) is protected from removal.
# NOTE: design_batch also contains the categorical covariate dummy columns;
# as model-matrix columns they are numeric, so passing them via `covariates`
# is valid.
quantlog.counts_covar = limma::removeBatchEffect( # based on lmFit
quantlog.counts,
covariates = design_batch, # continuous/ numerical covariates here
design = design_treatment)After the covariate correction, we process with the same PC analysis as before.
# PCA on the covariate-corrected matrix, repeating the earlier analysis
pca_adj <- prcomp(t(quantlog.counts_covar))
# pca2 <- prcomp(t(quantlog.counts), center = TRUE, scale. = FALSE)
# First PC at which the cumulative variance explained exceeds 85%
highest_pc_adj <- which(summary(pca_adj)$importance[3, ] > 0.85)[1]
# Sample scores for the retained PCs, with sample IDs as a column
pca_adj_out <- pca_adj$x %>%
.[, seq(highest_pc_adj)] %>%
tibble::as_tibble(rownames = "bxp_id_full")
# Per-PC variance, % of total variance and cumulative %
var_explained_adj <- pca_adj$sdev^2 %>%
tibble::tibble(PC = factor(seq_along(.)),
variance = .) %>%
dplyr::mutate(pct = variance/sum(variance)*100,
pct_cum = cumsum(pct))
# Scree plot as before; only ~10% extra PCs are shown, since far more PCs are
# needed after correction. floor() makes the previous implicit truncation of
# 1:(n*1.1) explicit (same PCs selected).
var_explained_adj_plot <- var_explained_adj %>%
dplyr::filter(PC %in% seq_len(floor(highest_pc_adj * 1.1))) %>%
ggplot(aes(x = PC)) +
geom_col(aes(y = pct), color = "black", fill = ggsci::pal_jco()(3)[3]) +
geom_line(aes(y = pct_cum, group = 1)) +
geom_point(aes(y = pct_cum)) +
geom_text(aes(label = round(pct, 1), y = pct + 3), size = 3) +
scale_y_continuous(expand = expansion(), limits = c(0, 100),
breaks = c(seq(0, 100, 25), 85)) +
# Hard-coded linetype: the mapped aes(linetype =) was redundant because the
# literal linetype overrode it and the legend was suppressed anyway.
geom_hline(yintercept = 85, linetype = "dashed") +
labs(x = "Principal component", y = "% Variance explained") +
custom_gg_theme + theme(legend.position = "right")
var_explained_adj_plotThis time, we need 29 PCs to reach 85% of variance explained. Only the first 9 PCs will be shown in the following plots.
# Kruskal-Wallis test of PC1 vs Group after correction: the non-significant
# p-value (cf. p = 0.042 before correction) suggests the groups no longer
# separate along PC1.
pca_adj_out %>%
dplyr::left_join(metadata %>%
dplyr::select(bxp_id_full, Group),
by = "bxp_id_full") %>%
kruskal.test(PC1 ~ Group, data = .)##
## Kruskal-Wallis rank sum test
##
## data: PC1 by Group
## Kruskal-Wallis chi-squared = 5.1531, df = 4, p-value = 0.2719
# Covariate-coloured PC plots on the corrected data. select(1:11) keeps the
# bxp_id_full column plus PC1-PC10.
# NOTE(review): the prose above says "the first 9 PCs will be shown" but this
# selects 10 PCs -- confirm which is intended.
# NOTE(review): sapply() for side effects only; a for loop / purrr::walk()
# would be more idiomatic.
r <- sapply(c(covariates, "pd"), function(covariate){
cat("### ", covariate, " {-}\n", sep = "")
plot(plotPCgroup(pca_adj_out %>% dplyr::select(1:11), metadata, covariate))
cat("\n\n")
})